from easy_io import write_pkl_file
import pandas as pd
import random


def _read_label_table(path):
    """Parse a label CSV once and return ``(ids, labels)``.

    ``ids`` holds the first-column values in file order; ``labels`` maps each
    id to the value found in the second column of the same row.  Any further
    columns are ignored.
    """
    values = pd.read_csv(path).values
    ids = list(values[:, 0])
    return ids, dict(zip(ids, values[:, 1]))


def _assign_folds(scanids, cancerlist, num_folds):
    """Deal the ids of each class round-robin into ``num_folds`` folds.

    Ids labelled 0 are dealt first, then ids labelled 1, each in the order
    they appear in ``scanids``; ids with any other label are left out.
    Returns ``{scanid: {'fold': int, 'cancer': label}}``.
    """
    foldlist = {}
    for label in (0, 1):
        members = [scanid for scanid in scanids if cancerlist[scanid] == label]
        for position, scanid in enumerate(members):
            foldlist[scanid] = {'fold': position % num_folds,
                                'cancer': cancerlist[scanid]}
    return foldlist


def main():
    """Build a class-balanced 4-fold split of the labelled scans and save it.

    Scan ids and labels are read from ``files/stage1_labels.csv`` and
    ``files/stage1_test_labels.csv`` (on a conflicting id the second file's
    label wins).  The ids are shuffled with a fixed seed so the split is
    reproducible, then each class is dealt round-robin into four folds.  The
    folds of the first ten shuffled ids and a per-fold class summary are
    printed, and the resulting mapping is passed to ``write_pkl_file``.
    """
    num_folds = 4
    random.seed(2112)

    train_ids, train_labels = _read_label_table('files/stage1_labels.csv')
    test_ids, test_labels = _read_label_table('files/stage1_test_labels.csv')

    # Plain update() instead of dict(a, **b): the latter raises TypeError
    # whenever the ids are not strings.
    cancerlist = dict(train_labels)
    cancerlist.update(test_labels)

    # Keep each id once (first occurrence wins the position).  A repeated id
    # would otherwise take two round-robin slots and unbalance the folds.
    # With duplicate-free input the list, and hence the shuffle, is unchanged.
    scanids = []
    seen = set()
    for scanid in train_ids + test_ids:
        if scanid not in seen:
            seen.add(scanid)
            scanids.append(scanid)
    random.shuffle(scanids)

    foldlist = _assign_folds(scanids, cancerlist, num_folds)

    for scanid in scanids[:10]:
        print(foldlist[scanid]['fold'])

    # Tally both classes for every fold in a single pass over the result.
    cancers = [0] * num_folds
    nocancers = [0] * num_folds
    for entry in foldlist.values():
        if entry['cancer'] == 1:
            cancers[entry['fold']] += 1
        elif entry['cancer'] == 0:
            nocancers[entry['fold']] += 1
    for ifold in range(num_folds):
        print("fold {} , contains : {} cancers, {} nocancers".format(
            ifold, cancers[ifold], nocancers[ifold]))

    write_pkl_file('files/kaggle_training_set_4_fold_list.pkl', foldlist)


# Build and save the fold list only when this file is run as a script,
# not when it is imported as a module.
if __name__ == '__main__':
    main()
